In [ ]:
import numpy as np, sklearn as sk, pandas as pd
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt
import time as tm, os, regex as re
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
DATAPATH = os.path.realpath( os.path.join( ".", "data", "imdb" ) )
Download the necessary NLTK corpora and models.
In [ ]:
import nltk
assert( nltk.download( [ "stopwords", "wordnet", "wordnet_ic", "punkt" ] ) )
Load the English stopword list and the Punkt sentence tokenizer.
In [ ]:
from nltk.corpus import stopwords as nl_sw
import nltk.data
english_stopwords = set( nl_sw.words( "english" ) )
english_tokenizer = nltk.data.load( "tokenizers/punkt/english.pickle" )
Load both labelled and unlabelled train datasets.
In [ ]:
# Read data from files
unlabelled_train_data = pd.read_csv( os.path.join( DATAPATH, 'unlabeledTrainData.tsv' ),
sep = "\t", header = 0, quoting = 3, encoding="utf-8" )
labelled_train_data = pd.read_csv( os.path.join( DATAPATH, 'labeledTrainData.tsv' ),
sep = "\t", header = 0, quoting = 3, encoding="utf-8" )
Define preprocessors
In [ ]:
def __wordlist( text, stops = None ) :
    # Strip the HTML markup, keep letters only, lowercase and split into words,
    # optionally dropping stopwords.
    letters_only = re.sub( "[^a-zA-Z]", " ", bs( text ).get_text( ) )
    words = letters_only.lower( ).split( )
    if stops is not None :
        return [ w for w in words if not w in stops ]
    return words
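As a quick sanity check of the preprocessor (a sketch only; the exact output depends on the data on disk), it can be run on the first labelled review with the stopword list loaded above.
In [ ]:
## Preview: the first 15 content words of the first labelled review,
## with English stopwords removed.
print __wordlist( labelled_train_data.review[ 0 ], stops = english_stopwords )[ :15 ]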
Define a helper that splits a review into tokenized sentences.
In [ ]:
def __sentences( text, tokenizer = None, stops = None ) :
    # Split a review into sentences and turn each sentence into a word list.
    raw_sentences = tokenizer.tokenize( text.strip( ) )
    return [ __wordlist( s, stops = stops )
             for s in raw_sentences if len( s ) > 0 ]
Cut each review into sentences.
In [ ]:
train_sentences = list( )
if not os.path.exists( os.path.join( DATAPATH, 'imdb_review_train_sentences.txt' ) ) :
    print "Cutting reviews into sentences."
    ## Begin time
    tock = tm.time( )
    ## Convert reviews into sentences
    print "Labelled train dataset..."
    for r in labelled_train_data.review :
        train_sentences.extend( __sentences( r, english_tokenizer, stops = None ) )
    print "Unlabelled train dataset..."
    for r in unlabelled_train_data.review :
        train_sentences.extend( __sentences( r, english_tokenizer, stops = None ) )
    ## End time
    tick = tm.time( )
    ## Report
    print "Preprocessing took %.1f sec." % ( tick - tock, )
    print "Caching..."
    ## Store the processed sentences in a UTF-8 text file
    with open( os.path.join( DATAPATH, 'imdb_review_train_sentences.txt' ), 'wb' ) as cache :
        cache.writelines( "\t".join( s ).encode( 'utf8' ) + "\n" for s in train_sentences )
    ## Final time
    tock = tm.time( )
else :
    print "Loading cached sentences..."
    ## Begin time
    tick = tm.time( )
    with open( os.path.join( DATAPATH, 'imdb_review_train_sentences.txt' ), 'rb' ) as cache :
        train_sentences.extend( l.decode( 'utf8' ).strip( ).split( '\t' ) for l in cache.readlines( ) )
    ## End time
    tock = tm.time( )
    ## Report
    print "Loaded sentences in %.1f sec." % ( tock - tick, )
Get the vector representation of words using word2vec from the gensim module.
In [ ]:
import gensim.models, time as tm
# Initialize the model
model = gensim.models.Word2Vec(
workers = 7, # Number of threads to run in parallel
size = 300, # Word vector dimensionality
min_count = 40, # Minimum word count for pruning the internal dictionary
window = 10, # Context window size
sample = 1e-3 ) # Downsample setting for frequent words
model_cache_name = "W2V_%d-%d-%d.mdl" % ( model.layer1_size, model.min_count, model.window , )
if not os.path.exists( os.path.join( DATAPATH, model_cache_name ) ) :
    ## Begin time
    tock = tm.time( )
    ## First pass -- building the vocabulary
    model.build_vocab( train_sentences )
    ## Second pass -- training the neural net
    model.train( train_sentences )
    ## End time
    tick = tm.time( )
    ## Report
    print "Training word2vec took %.1f sec." % ( tick - tock, )
    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims( replace = True )
    # It can be helpful to create a meaningful model name and
    # save the model for later use. You can load it later using Word2Vec.load()
    model.save( os.path.join( DATAPATH, model_cache_name ) )
    ## End time
    tock = tm.time( )
else :
    ## Begin time
    tick = tm.time( )
    ## Load the model from the blob
    model = gensim.models.Word2Vec.load( os.path.join( DATAPATH, model_cache_name ) )
    ## End time
    tock = tm.time( )
    ## Report
    print "Model loaded in %.1f sec." % ( tock - tick, )
Let's see how well the trained model performs on Google's analogical proportions dataset (questions-words.txt).
In [ ]:
print "Testing Google's analogical proportions..."
tick = tm.time( )
## test model accuracy against the Google dataset
google_dataset_accuracy = model.accuracy( os.path.join( DATAPATH, 'questions-words.txt' ) )
tock = tm.time( )
print "Completed in %.1f sec." % ( tock - tick, )
In [ ]:
print "####\tCORRECT\tTOTAL\tSECTION"
for i, s in enumerate( google_dataset_accuracy, 0 ) :
    total = len( s['correct'] ) + len( s['incorrect'] )
    print "%4d\t%4d\t%5d\t%s." % ( i, len( s['correct'] ), total, s['section'], )
These results are poor because the review database is not a general language corpus: it does not cover the English language broadly enough, it is topically biased, and, being mostly user-generated content, it is stylistically colloquial.
Let's see exactly how the IMDB reviews fail as a corpus for Google's analogical proportion test: word A is to B as C is to D.
In [ ]:
for A, B, C, expected in google_dataset_accuracy[1]["incorrect"][:10] :
    predictions = [ p for p, s in model.most_similar( positive = [ B, C ], negative = [ A ], topn = 5 ) ]
    if expected not in predictions :
        print "%s - %s : %s - %s " % ( A, B, C, expected, ), predictions
Not unexpectedly, the reviews do not cover relations between geographical terms well enough.
In [ ]:
model.most_similar( "king" )
The terms most similar to "king" are the Disney animated film "Lion King", the fictional beast "King Kong", and Stephen King, the author of many a horror and supernatural fiction novel. This document set is no good for testing general language semantics. Aladdin is no king.
In [ ]:
model.most_similar( "gothic" )
One would expect to see at least one reference to the architectural style, but the reviews are mostly focused on genres and movies.
In [ ]:
print "west\t - ", [ d for d, s in model.most_similar( [ "south", "west" ], [ "north" ], topn = 5 ) ]
print "east\t - ", [ d for d, s in model.most_similar( [ "south", "east" ], [ "north" ], topn = 5 ) ]
print "north\t - ", [ d for d, s in model.most_similar( [ "west", "north" ], [ "east" ], topn = 5 ) ]
print "south\t - ", [ d for d, s in model.most_similar( [ "west", "south" ], [ "east" ], topn = 5 ) ]
The model, trained on IMDB reviews, fails to correctly identify three of the four cardinal directions.
In [ ]:
print model.doesnt_match("sea ocean lake river".split())
In [ ]:
print model.doesnt_match( "good bad ugly horrible".split( ) )
In [ ]:
print model.most_similar( positive=['woman', 'king'], negative=['man'], topn=1)
print model.doesnt_match("breakfast cereal dinner milk".split())
print model.similarity('woman', 'man')
In [ ]:
vocab = np.asarray( model.vocab.keys(), dtype = np.str)
# vocab[ np.argmax( np.abs(model.syn0), axis = 0 ) ]
In [ ]:
vocab
Implement a lemmatizer based on WordNet relationship data and apply it to the sentences of the reviews.
In [ ]:
wnl = nltk.WordNetLemmatizer( )
def __lemmatize( text, lemmatizer, tokenizer ) :
    # Strip the HTML markup and double quotes, split into sentences,
    # remove punctuation and lemmatize every word.
    processed_text = re.sub( "\"", "", bs( text ).get_text( ) )
    raw_sentences = tokenizer.tokenize( processed_text.strip( ).lower( ) )
    return [ lemmatizer.lemmatize( w )
             for s in raw_sentences for w in re.sub( r"\p{Punctuation}+", " ", s ).split( ) ]
Collect lemmatized reviews into one "corpus"
In [ ]:
lemmatized_reviews = list( )
print "Lemmatizing the reviews."
## Begin time
tock = tm.time( )
## Lemmatize each review
print "Labelled train dataset..."
for r in labelled_train_data.review :
    lemmatized_reviews.append( __lemmatize( r, wnl, english_tokenizer ) )
print "Unlabelled train dataset..."
for r in unlabelled_train_data.review :
    lemmatized_reviews.append( __lemmatize( r, wnl, english_tokenizer ) )
## End time
tick = tm.time( )
## Report
print "Preprocessing took %.1f sec." % ( tick - tock, )
Import gensim toolkit
In [ ]:
from gensim import corpora, models, similarities
Construct the term vocabulary
In [ ]:
if not os.path.exists( os.path.join( DATAPATH, 'LDA_vocabulary.dct' ) ) :
    vocabulary = corpora.Dictionary( lemmatized_reviews )
Ditch too frequent or too rare terms.
In [ ]:
if not os.path.exists( os.path.join( DATAPATH, 'LDA_vocabulary.dct' ) ) :
    vocabulary.filter_extremes( no_below = 5, no_above = 0.5, keep_n = None )
    vocabulary.save( os.path.join( DATAPATH, 'LDA_vocabulary.dct' ) )
else :
    vocabulary = corpora.Dictionary.load( os.path.join( DATAPATH, 'LDA_vocabulary.dct' ) )
In [ ]:
vocabulary
Transform the document words into word ID vectors: bag-of-terms.
In [ ]:
corpus = [ vocabulary.doc2bow( text ) for text in lemmatized_reviews ]
corpora.MmCorpus.serialize( os.path.join( DATAPATH, 'LDA_bow.mm' ), corpus ) # store on disc
Train a Latent Dirichlet Allocation model.
In [ ]:
## Begin time
tick = tm.time( )
## Fit the LDA model
model = models.ldamodel.LdaModel(
corpus, id2word = vocabulary, num_topics = 100, chunksize = 50, update_every = 1, passes = 2 )
## End time
tock = tm.time()
In [ ]:
print "Estimating LDA model took %.3f sec."%( tock - tick, )
What is the LDA model? Formally, the setting is as follows: given a set of documents $D$, a vocabulary of words $W$ and a set of latent topics $T$, every topic $t\in T$ is endowed with a word distribution $\theta_t \sim \text{Dir}_W(\alpha)$, every document $d\in D$ with a topic distribution $\phi_d \sim \text{Dir}_T(\beta)$, and each word occurrence $i$ in document $d$ is generated by first drawing a topic $z_{di} \sim \text{Cat}_T(\phi_d)$ and then the word $w_{di} \sim \text{Cat}_W(\theta_{z_{di}})$,
where $\text{Dir}_F(\alpha)$ is the Dirichlet distribution on the simplex $S^\circ_F = \{ x\in [0,1]^F\big| \sum_{i\in F} x_i = 1 \}$ with parameter $\alpha > 0$ and density, for any $x\in [0,1]^F$, $$ \text{Dir}_F\bigl( x;\alpha \bigr) = \frac{\Gamma(\sum_{i\in F} \alpha_i)}{\prod_{i\in F} \Gamma(\alpha_i)} 1_{x\in S^\circ_F } \prod_{i\in F} x_i^{\alpha_i-1}\,, $$
and $\text{Cat}_F(\theta)$ is the categorical distribution on $F$ with parameter $\theta$ and density $$ \text{Cat}_F(x;\theta) = \theta_x = \prod_{i\in F} \theta_i^{1_{x=i}}\,, $$ which is the distribution of a discrete random variable with values in $F$.
Let $w_d = \bigl( w_{di} \bigr)_{i\in d}$ for any $d\in D$. Then the log-likelihood of the model is $$ L( D |\alpha, \beta ) = \log \prod_{d\in D} p_d( w_d |\alpha, \beta ) = \sum_{d\in D} \sum_{i\in d} \log p_d( w_{di} |\alpha, \beta )\,, $$ where $$ p_d\bigl( w | \alpha, \beta \bigr) = \mathbb{E}_{(\theta,\phi) \sim \text{Dir}_W(\alpha) \times \text{Dir}_T(\beta)}\, p_d\bigl( w | \theta, \phi \bigr) = \iint p_d\bigl( w | \theta, \phi \bigr) \text{Dir}_W(\theta; \alpha) \times \text{Dir}_T(\phi; \beta) d\theta d\phi\,, $$ and $$ p_d\bigl( w | \theta, \phi \bigr) = \sum_{z \in T} p_d( w, z |\theta, \phi ) = \sum_{z \in T} p_d( w | z, \theta, \phi ) p_d( z | \theta, \phi ) = \sum_{z \in T} \theta_{zw} p_d( z | \phi ) = \sum_{z \in T} \theta_{zw} \phi_{dz} \,, $$ for $\theta=(\theta_t)_{t\in T}$ and $\phi = (\phi_d)_{d\in D}$.
In probabilistic Latent Semantic Analysis the likelihood is $$ L( D |\theta, \phi ) = \prod_{d\in D} \prod_{i\in d} p_d( w_{di} |\theta,\phi ) \,, $$ with $p_d(\cdot)$ being the term distribution in a particular document $d\in D$. The log-likelihood is $$ l(D|\theta,\phi) = \sum_{d\in D} \sum_{i\in d} \log \sum_{z_{di}\in T} p_d( w_{di}, z_{di} |\theta,\phi ) \,,$$ since each word comes from a mixture of topic distributions, with the mixture component determined by $z_{di}$.
If the latent topic of each word were known, then the complete-data log-likelihood would be: $$ l(D, Z|\theta,\phi) = \sum_{d\in D} \sum_{i\in d} \log \theta_{z_{di}w_{di}} + \sum_{d\in D} \sum_{i\in d} \log \phi_{d\,z_{di}} \,,$$ which in a more analytically-friendly notation looks like: $$ l(D, Z|\theta,\phi) = \sum_{d\in D} \sum_{i\in d} \sum_{t\in T} \sum_{v\in W} 1_{t=z_{di}} 1_{v=w_{di}} \log \theta_{tv} + \sum_{d\in D} \sum_{i\in d} \sum_{t\in T} 1_{t=z_{di}} \log \phi_{dt} \,,$$ whence $$ l(D, Z|\theta,\phi) = \sum_{t\in T} \sum_{v\in W} \log \theta_{tv} \sum_{d\in D} \sum_{i\in d} 1_{t=z_{di}} 1_{v=w_{di}} + \sum_{t\in T} \sum_{c\in D} \log \phi_{ct} \sum_{d\in D} \sum_{i\in d} 1_{t=z_{di}} 1_{c=d} \,. $$
Since the topics $z_{di}$ are not observed, the EM algorithm replaces the indicators by their posterior expectations, which are obtained with Bayes' formula: $$ p_d(z|w) = \frac{p_d(w,z)}{ p_d(w) } = \frac{p_d(w,z)}{ \sum_{t\in T} p_d(w,t) } = \frac{p_d(w|z)\,p_d(z)}{ \sum_{t\in T} p_d(w,t) } = \frac{ \theta_{zw} \phi_{dz}}{ \sum_{t\in T} \theta_{tw} \phi_{dt} }\,. $$
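To make these formulas concrete, here is a minimal numpy sketch of plain EM for the pLSA-style likelihood above, run on a tiny random document-term count matrix. It is an illustration only: the array names (n_dv, theta, phi) are ad hoc, and gensim's LdaModel is fitted with online variational Bayes rather than this loop.
In [ ]:
import numpy as np

## Toy EM for the pLSA-style likelihood above, on a random document-term
## count matrix.
rng = np.random.RandomState( 0 )
D, V, T = 6, 12, 3                               # documents, vocabulary, topics
n_dv = rng.poisson( 1.0, size = ( D, V ) )       # word counts

theta = rng.dirichlet( np.ones( V ), size = T )  # theta[ t, v ] -- p( v | t )
phi = rng.dirichlet( np.ones( T ), size = D )    # phi[ d, t ]   -- p( t | d )

for _ in range( 50 ) :
    ## E-step: posterior p_d( t | v ) = theta[ t, v ] * phi[ d, t ] / sum over t
    post = phi[ :, :, None ] * theta[ None, :, : ]        # shape ( D, T, V )
    post /= post.sum( axis = 1, keepdims = True ) + 1e-12
    ## M-step: re-estimate theta and phi from the expected topic counts
    expected = n_dv[ :, None, : ] * post                  # shape ( D, T, V )
    theta = expected.sum( axis = 0 )
    theta /= theta.sum( axis = 1, keepdims = True )
    phi = expected.sum( axis = 2 )
    phi /= phi.sum( axis = 1, keepdims = True )

print phi.round( 2 )                             # recovered document-topic mixtures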
Let's have a look at the topics uncovered by the LDA, each represented by its most likely words.
In [ ]:
for p in range( 10 ) :
    for t in range( 20, 25 ) :
        print model.show_topic( t )[ p ][ 1 ].center( 20, ' ' ),
    print
Sadly, they do not readily lend themselves to use as topic keywords.
In [ ]:
model.show_topic(1)
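As a further check (a minimal sketch; it assumes the fitted LDA model and the corpus from the cells above are still in memory, and that show_topic returns (probability, word) pairs as used earlier), the model can be queried for the topic mixture of an individual review:
In [ ]:
## Topic mixture of the first review: a list of ( topic id, weight ) pairs,
## shown together with the five most likely words of each top topic.
doc_topics = model[ corpus[ 0 ] ]
for t, w in sorted( doc_topics, key = lambda tw : -tw[ 1 ] )[ :5 ] :
    print "topic %3d (%.3f):" % ( t, w, ), " ".join( word for prob, word in model.show_topic( t )[ :5 ] )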
In [ ]: